R code for lecture 26

wells <- read.csv("ecol 563/wells.txt")
wells
# logistic regression model
out1 <- glm(cbind(y, n-y)~ land.use + sewer, data=wells, family=binomial)
summary(out1)
 
# tabulate data to show quasi-complete separation
xtabs(cbind(y,n-y)~land.use, data=wells)
 
# refit model to show individual logit means for land use
out1a <- glm(cbind(y, n-y)~ land.use + sewer-1, data=wells, family=binomial)
summary(out1a)
 
# models are the same, just different parameterizations
AIC(out1,out1a)
 
# create binary version of the file for comparison
# function to create binary response
yn.func <- function(x) rep(c(1,0),c(x[1],x[2]-x[1]))
 
# sample calculation
apply(wells[1:4,1:2], 1, yn.func)
 
# assemble data
wells.raw <- data.frame(yn=unlist(apply(wells[,1:2], 1, yn.func)), land.use=rep(wells$land.use,wells$n), sewer=rep(wells$sewer,wells$n))
dim(wells.raw)
wells.raw[1:8,]
 
# fit same model this time with a binary response
out1b <- glm(yn~ land.use + sewer, data=wells.raw, family=binomial)
 
# estimates and standard errors are the same, likelihood is different
summary(out1b)
summary(out1)
coef(out1b)
coef(out1)
 
#### methods for dealing with quasi-complete separation ####
 
# Method 1: Firth regression--uses penalized ML
library(logistf)
out1f <- logistf(yn~ land.use + sewer, data=wells.raw, family=binomial)
out1f
 
# Method 2: treat problem group separately: calculate Clopper-Pearson intervals
# sample run
binom.test(2,76)
# CI for land use=undeveloped
binom.test(0,76)
 
# Method 3: collapse categories
xtabs(y~land.use, data=wells)
# examine proportion of contaminated wells by land use category
xtabs(y~land.use, data=wells)/sum(xtabs(y~land.use, data=wells))
 
# combine undeveloped with agricultural
wells$land.use2 <- factor(ifelse(wells$land.use %in% c('agri','undev'), 'rural', as.character(wells$land.use)))
xtabs(y~land.use2, data=wells)
 
# refit model with new categories
out2 <- glm(cbind(y,n-y)~land.use2+sewer, data=wells, family=binomial)
summary(out2)
 
#### simplifying the model: reduce the number of land use categories
 
# fit model to get separate logits for each land use type
out2b <- glm(cbind(y,n-y)~land.use2+sewer-1, data=wells, family=binomial)
# examine estimates
summary(out2b)
coef(out2b)
 
# display estimates in a dotplot
library(lattice)
# estimates appear to fall into three clusters
dotplot(coef(out2b)[1:9])
# combine five the categories
wells$land.use3 <- factor(ifelse(wells$land.use2 %in% c('inst','recr','resL','resM','trans'), 'mixed', as.character(wells$land.use2)))
 
out2c <- glm(cbind(y,n-y)~land.use3+sewer, data=wells, family=binomial)
 
# compare with previous model using AIC
AIC(out2b, out2c)
# models are nested: use LR test
anova(out2c, out2b, test='Chisq')
 
# try collapsing an additional three land use categories
wells$land.use4 <- factor(ifelse(wells$land.use3 %in% c('resH','comm','indus'), 'high.use', as.character(wells$land.use3)))
out2d <- glm(cbind(y,n-y)~land.use4+sewer, data=wells, family=binomial)
 
# compare all the land use models
AIC(out2b, out2c, out2d)
# LR test with previous model
anova(out2d, out2c, test='Chisq')
summary(out2d)
 
# obtain estimates of individual land use logits
out2e <- glm(cbind(y,n-y)~land.use4+sewer-1, data=wells, family=binomial)
summary(out2e)
# display confidence intervals
confint(out2e)
# predicted logits of individual observations
predict(out2d)
# predicted probabilities of individual observations
predict(out2d, type='response')
fitted(out2d)
 
# obtain mean of each binomial distribution
fitted(out2d)*wells$n
wells$y
 
# goodness of fit
# collect the predicted successes and predicted failures in a matrix
 
Ei <- cbind(fitted(out2d)*wells$n, wells$n-fitted(out2d)*wells$n)
 
# too many predicted counts are small for chi-squared distribution to hold
sum(Ei<=5)/length(Ei)
 
# carry out Pearson test anyway

Oi <- cbind(wells$y, wells$n-wells$y)
sum((Oi-Ei)^2/Ei)
 
# p-value for test
1-pchisq(sum((Oi-Ei)^2/Ei),nrow(wells)-length(coef(out2d)))
 
# residual deviance can be used to check fit too
# (except probably inappropriate here)
summary(out2d)
# p-value for residual deviance
1-pchisq(21.502,16)
R Code for Lecture 26 (Monday, November 26, 2012)